{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "2e25cf66",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import uuid\n",
    "import pickle\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "import mlflow\n",
    "\n",
    "from sklearn.feature_extraction import DictVectorizer\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.pipeline import make_pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "5887deea",
   "metadata": {},
   "outputs": [],
   "source": [
    "year = 2021\n",
    "month = 2\n",
    "taxi_type = 'green'\n",
    "\n",
    "input_file = f'https://s3.amazonaws.com/nyc-tlc/trip+data/{taxi_type}_tripdata_{year:04d}-{month:02d}.parquet'\n",
    "output_file = f'output/{taxi_type}/{year:04d}-{month:02d}.parquet'\n",
    "\n",
    "RUN_ID = os.getenv('RUN_ID', 'e1efc53e9bd149078b0c12aeaa6365df')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "b9666e19",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_uuids(n):\n",
    "    ride_ids = []\n",
    "    for i in range(n):\n",
    "        ride_ids.append(str(uuid.uuid4()))\n",
    "    return ride_ids\n",
    "\n",
    "def read_dataframe(filename: str):\n",
    "    df = pd.read_parquet(filename)\n",
    "\n",
    "    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime\n",
    "    df.duration = df.duration.dt.total_seconds() / 60\n",
    "    df = df[(df.duration >= 1) & (df.duration <= 60)]\n",
    "    \n",
    "    df['ride_id'] = generate_uuids(len(df))\n",
    "\n",
    "    return df\n",
    "\n",
    "\n",
    "def prepare_dictionaries(df: pd.DataFrame):\n",
    "    categorical = ['PULocationID', 'DOLocationID']\n",
    "    df[categorical] = df[categorical].astype(str)\n",
    "    \n",
    "    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']\n",
    "\n",
    "    categorical = ['PU_DO']\n",
    "    numerical = ['trip_distance']\n",
    "    dicts = df[categorical + numerical].to_dict(orient='records')\n",
    "    return dicts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "6b5f0d80",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_model(run_id):\n",
    "    logged_model = f's3://mlflow-models-alexey/1/{RUN_ID}/artifacts/model'\n",
    "    model = mlflow.pyfunc.load_model(logged_model)\n",
    "    return model\n",
    "\n",
    "\n",
    "def apply_model(input_file, run_id, output_file):\n",
    "\n",
    "    df = read_dataframe(input_file)\n",
    "    dicts = prepare_dictionaries(df)\n",
    "\n",
    "    \n",
    "    model = load_model(run_id)\n",
    "    y_pred = model.predict(dicts)\n",
    "\n",
    "    df_result = pd.DataFrame()\n",
    "    df_result['ride_id'] = df['ride_id']\n",
    "    df_result['lpep_pickup_datetime'] = df['lpep_pickup_datetime']\n",
    "    df_result['PULocationID'] = df['PULocationID']\n",
    "    df_result['DOLocationID'] = df['DOLocationID']\n",
    "    df_result['actual_duration'] = df['duration']\n",
    "    df_result['predicted_duration'] = y_pred\n",
    "    df_result['diff'] = df_result['actual_duration'] - df_result['predicted_duration']\n",
    "    df_result['model_version'] = run_id\n",
    "    \n",
    "    df_result.to_parquet(output_file, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "cc2899e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "apply_model(input_file=input_file, run_id=RUN_ID, output_file=output_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "b75bd6c9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2021-02.parquet  2021-03.parquet\r\n"
     ]
    }
   ],
   "source": [
    "!ls output/green/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "289d4bc4",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
